In [2]:
%matplotlib notebook

import itertools
import logging
from functools import partial

import gensim
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pnd
# NOTE(review): star import pollutes the namespace; prefer importing the
# specific clustering estimators actually used so readers can trace names.
from sklearn.cluster import *
# NOTE(review): RandomizedPCA was deprecated in scikit-learn 0.18 and removed
# later; PCA(svd_solver="randomized") is the modern equivalent — confirm the
# installed scikit-learn version before upgrading.
from sklearn.decomposition import PCA, RandomizedPCA
from sklearn.manifold import TSNE

# NOTE(review): another star import from a project-local module; contents
# cannot be verified from this file.
from knub.thesis.util import *
matplotlib.style.use('ggplot')

In [10]:
pnd.set_option("display.max_colwidth", 100)

Topic Models → Topic Coherence, Concept Categorization

Evaluated using the Palmetto tool from the paper *Exploring the Space of Topic Coherence Measures*. The values still seem low compared to the example values reported in the paper.


In [4]:
# Topic-coherence (TC) and concept-categorization (CC) scores per topic model.
# CC_purity is recorded here for provenance but dropped from the analysis below.
_tc_records = [
    ("topic.full.alpha-1-100.256-400.model", 0.469500859375, 0.00617111859067, 0.6463414634146342),
    ("topic.16-400.model", 0.43805875, 0.00390183951094, 0.5975609756097561),
    ("topic.256-1000.model", 0.473455351563, 0.00635883046394, 0.5853658536585366),
    ("topic.64-400.model", 0.45327734375, 0.00385141007263, 0.6341463414634146),
    ("topic.256-400.model", 0.46836359375, 0.00599032492068, 0.5731707317073171),
    ("topic.full.fixed-vocabulary.alpha-1-100.256-400.model", 0.468437070312, 0.00562772603243, 0.5975609756097561),
    ("topic.full.256-400.model", 0.472498945313, 0.00624853749772, 0.5975609756097561),
    ("topic.256-600.model", 0.478640273437, 0.00685787139094, 0.5609756097560975),
]
df_tc_results = pnd.DataFrame(
    _tc_records,
    columns=["Topic model parameters", "TC_mean", "TC_var", "CC_purity"],
)
del df_tc_results["CC_purity"]

In [5]:
df_tc_results.sort_values(by="TC_mean", ascending=False)


Out[5]:
Topic model parameters TC_mean TC_var
7 topic.256-600.model 0.478640 0.006858
2 topic.256-1000.model 0.473455 0.006359
6 topic.full.256-400.model 0.472499 0.006249
0 topic.full.alpha-1-100.256-400.model 0.469501 0.006171
5 topic.full.fixed-vocabulary.alpha-1-100.256-40... 0.468437 0.005628
4 topic.256-400.model 0.468364 0.005990
3 topic.64-400.model 0.453277 0.003851
1 topic.16-400.model 0.438059 0.003902

In [7]:
df_tc_results.sort_values(by="TC_var", ascending=False)


Out[7]:
Topic model parameters TC_mean TC_var
7 topic.256-600.model 0.478640 0.006858
2 topic.256-1000.model 0.473455 0.006359
6 topic.full.256-400.model 0.472499 0.006249
0 topic.full.alpha-1-100.256-400.model 0.469501 0.006171
4 topic.256-400.model 0.468364 0.005990
5 topic.full.fixed-vocabulary.alpha-1-100.256-40... 0.468437 0.005628
1 topic.16-400.model 0.438059 0.003902
3 topic.64-400.model 0.453277 0.003851

In [12]:
# Second coherence run, precomputed and stored as a tab-separated file.
df_tc_results_2 = pnd.read_csv(
    "../models/topic_models_coherence_2.tsv", sep="\t", index_col=None
)
# Best mean topic coherence first.
df_tc_results_2.sort_values("TC_mean", ascending=False)


Out[12]:
model TC_mean TC_var
8 topic.256-400.first-2000.alpha-0-1.beta-0-1.model.ssv 0.495 0.095
5 topic.256-400.first-2000.alpha-0-01.beta-0-1.model.ssv 0.494 0.093
2 topic.256-400.first-2000.alpha-0-002.beta-0-1.model.ssv 0.478 0.084
7 topic.256-400.first-2000.alpha-0-1.beta-0-01.model.ssv 0.476 0.086
4 topic.256-400.first-2000.alpha-0-01.beta-0-01.model.ssv 0.475 0.083
6 topic.256-400.first-2000.alpha-0-1.beta-0-002.model.ssv 0.475 0.083
0 topic.256-400.first-2000.alpha-0-002.beta-0-002.model.ssv 0.470 0.079
1 topic.256-400.first-2000.alpha-0-002.beta-0-01.model.ssv 0.470 0.079
3 topic.256-400.first-2000.alpha-0-01.beta-0-002.model.ssv 0.469 0.079
9 embedding.model.skip-gram.ssv 0.466 0.123
10 embedding.model.cbow.ssv 0.433 0.067

Word Embeddings → Analogy Reasoning

Using manually set parameters

Using the question word data set (~19k questions) from Efficient Estimation of Word Representations in Vector Space (word2vec).


In [8]:
# Analogy-reasoning accuracy for each embedding model (higher is better).
_ar_records = [
    ("embedding.skip-gram.size-200.window-5.negative-5.model", 0.481221858371),
    ("embedding.cbow.size-200.window-5.model", 0.416547277937),
    ("embedding.google.size-300", 0.735878018829),
]
df_ar_results = pnd.DataFrame(
    _ar_records,
    columns=["Word Embeddings", "Analogy_Reasoning"],
)

# Best-scoring embedding first.
df_ar_results.sort_values("Analogy_Reasoning", ascending=False)


Out[8]:
Word Embeddings Analogy_Reasoning
2 embedding.google.size-300 0.735878
0 embedding.skip-gram.size-200.window-5.negative... 0.481222
1 embedding.cbow.size-200.window-5.model 0.416547

Using Spearmint

Testing only skip-gram architecture.


In [12]:
# Spearmint hyper-parameter search results for the skip-gram architecture.
df_ar_spearmint_results = pnd.read_csv(
    "../code/python/knub/thesis/spearmint_analogy_reasoning/results.csv",
    index_col="model",
)
# Best hyper-parameter combinations first.
df_ar_spearmint_results.sort_values("Analogy_Reasoning", ascending=False)


Out[12]:
sample window negative size Analogy_Reasoning
model
4 0.010000 8 18 476 0.713262
5 0.000004 6 20 600 0.712955
3 0.005000 5 12 325 0.707941
1 0.000000 3 5 50 0.384977